#IMPORT (wrangled) data
raw_data <- read_csv("data/wrangled_utterances_representations.csv")

#participant ids in the desired factor-level order (happiness-first group listed first)
pid_levels <- c(
  "bjs827ee1u", "3r2sh20ei", "4728sjuiz", "7ACC0B75", "92ghd48xe", "iurmer28", "s294hoei", #HAPPINESS-FIRST
  "j2719eertu2", "lkin27js09b", "li832lin23", "7382kwtue", "E1D39056", "8v892iige"         #SPACE-FIRST
)

#factor() turns every PID that is not listed in pid_levels into NA without complaint,
#so surface mismatches (e.g. a typo in one of the levels) before they become missing data
unknown_pids <- setdiff(unique(raw_data$PID[!is.na(raw_data$PID)]), pid_levels)
if (length(unknown_pids) > 0) {
  warning(
    "PID values not listed in pid_levels will be coded NA: ",
    paste(unknown_pids, collapse = ", "),
    call. = FALSE
  )
}

#TASK_DATASET pairing -> presentation order (same mapping as the previous recode).
#A pairing with a missing TASK or DATASET has no entry here, so it maps to a real NA
#instead of a literal "NA" factor level.
order_lookup <- c(
  ixn_happiness    = "space-first",
  ixn_space        = "happiness-first",
  static_happiness = "happiness-first",
  static_space     = "space-first"
)

#WRANGLE into DF of utterance-insights [eg not unique utterances, 1 obs for each utterance_detail-code]
df_insights <- raw_data %>%
  #rename and factorize columns
  mutate(
    sid = factor(UID), #NOT actually a unique utterance id, treat as sheet order id
    pid = factor(PID, levels = pid_levels), #level order puts the happiness-first group first

    utterance = Utterance,
    #construct a unique ID for utterances from participant + utterance text
    #NOTE(review): rows whose pid is NA all share the "NA" prefix, so identical text from
    #different unidentified participants would receive the same uid
    uid = factor(as.numeric(factor(paste(pid, factor(utterance))))),

    TASK = factor(
      recode(Condition, "Static" = "static", "Interactive" = "ixn"),
      levels = c("static", "ixn") #static first
    ),
    DATASET = factor(recode(Notebook, "Happiness" = "happiness", "Space" = "space")), #cleanup diff case
    #NOTE(review): outcomeType is computed here but is not kept by select() below
    outcomeType = recode(DATASET, "happiness" = "numeric", "space" = "nominal"),
    #order var; level order kept as before (space-first, happiness-first)
    data_order = factor(
      unname(order_lookup[paste(TASK, DATASET, sep = "_")]),
      levels = c("space-first", "happiness-first")
    ),
    #top-level code, with "ANALYSIS PROCESS" shortened to "PROCESS"
    top_code = factor(
      recode(highlevel, "ANALYSIS PROCESS" = "PROCESS"),
      levels = c("PROCESS", "DATASET", "VARIABLE", "RELATIONSHIP")
    ),
    mid_code = factor(`Data Type`),
    low_code = factor(UtteranceType),
    timestamp = Timestamp,
    repns = group,
    ixn = factor(interaction_used) #was interaction used?
  ) %>%
  select( #select only needed columns
    sid, uid, pid, TASK, DATASET, data_order, ixn, top_code, mid_code, low_code, repns, timestamp, utterance
  )

#DF OF UNIQUE UTTERANCES
#one row per distinct (uid, pid, TASK, DATASET) combination; all code columns are dropped
df_uniques <- distinct(df_insights, uid, pid, TASK, DATASET)

print("DF of utterances — 1 row per utterance_detail-code")
## [1] "DF of utterances — 1 row per utterance_detail-code"
glimpse(df_insights)
## Rows: 743
## Columns: 13
## $ sid        <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ uid        <fct> 339, 351, 393, 369, 343, 366, 341, 397, 363, 377, 396, 371,…
## $ pid        <fct> j2719eertu2, j2719eertu2, j2719eertu2, j2719eertu2, j2719ee…
## $ TASK       <fct> static, static, static, static, static, static, static, sta…
## $ DATASET    <fct> space, space, space, space, space, space, space, space, spa…
## $ data_order <fct> space-first, space-first, space-first, space-first, space-f…
## $ ixn        <fct> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ top_code   <fct> DATASET, DATASET, DATASET, VARIABLE, DATASET, VARIABLE, VAR…
## $ mid_code   <fct> NA, NA, NA, distribution (categorical), NA, distribution (c…
## $ low_code   <fct> "data orientation", "data orientation", "data size", "distr…
## $ repns      <chr> "dataframe", "dataframe", "profile", "profile", "profile", …
## $ timestamp  <time> 19:57:00, 20:10:00, 21:27:00, 21:29:00, 21:38:00, 21:44:00…
## $ utterance  <chr> "\"Alright, so every row is the passenger, their home plane…
print("DF of unique utterances — 1 row per utterance [no codes]")
## [1] "DF of unique utterances — 1 row per utterance [no codes]"
glimpse(df_uniques)
## Rows: 662
## Columns: 4
## $ uid     <fct> 339, 351, 393, 369, 343, 366, 341, 397, 363, 377, 396, 371, 40…
## $ pid     <fct> j2719eertu2, j2719eertu2, j2719eertu2, j2719eertu2, j2719eertu…
## $ TASK    <fct> static, static, static, static, static, static, static, static…
## $ DATASET <fct> space, space, space, space, space, space, space, space, space,…

1 DATA PROFILE

TODO TALK WITH DYLAN — resolve missing data in TASK, outcomeType, timestamp…: (1) are these the result of ‘exploded’ utterances that were dual-coded? If so, the other attributes need to be carried across both observations; (2) a maximum of 2 detail-codes was applied to any utterance, correct? (3) where are the flag codes?

#PROFILE every column of df_insights (type, most frequent values, valid/missing counts)
summarytools::dfSummary(
  df_insights,
  plain.ascii  = FALSE,
  graph.magnif = 0.75,
  style        = "grid",
  tmp.img.dir  = "temp",
  missing.col  = FALSE,
  method       = "render"
)

1.0.1 Data Frame Summary

1.0.1.1 df_insights

Dimensions: 743 x 13
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 sid
[factor]
1. 0
2. 1
3. 2
4. 3
5. 4
6. 5
7. 6
8. 7
9. 8
10. 9
[ 733 others ]
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
733 (98.7%)
743
(100.0%)
0
(0.0%)
2 uid
[factor]
1. 1
2. 2
3. 3
4. 4
5. 5
6. 6
7. 7
8. 8
9. 9
10. 10
[ 652 others ]
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
1 ( 0.1%)
2 ( 0.3%)
1 ( 0.1%)
1 ( 0.1%)
731 (98.4%)
743
(100.0%)
0
(0.0%)
3 pid
[factor]
1. bjs827ee1u
2. 3r2sh20ei
3. 4728sjuiz
4. 7ACC0B75
5. 92ghd48xe
6. iurmer28
7. s294hoei
8. j2719eertu2
9. lkin27js09b
10. li832lin23
[ 3 others ]
29 ( 4.5%)
103 (15.9%)
43 ( 6.6%)
28 ( 4.3%)
56 ( 8.7%)
0 ( 0.0%)
88 (13.6%)
78 (12.1%)
43 ( 6.6%)
51 ( 7.9%)
128 (19.8%)
647
(87.1%)
96
(12.9%)
4 TASK
[factor]
1. static
2. ixn
392 (53.4%)
342 (46.6%)
734
(98.8%)
9
(1.2%)
5 DATASET
[factor]
1. happiness
2. space
420 (57.2%)
314 (42.8%)
734
(98.8%)
9
(1.2%)
6 data_order
[factor]
1. space-first
2. happiness-first
3. NA
300 (40.4%)
434 (58.4%)
9 ( 1.2%)
743
(100.0%)
0
(0.0%)
7 ixn
[factor]
1. FALSE
2. TRUE
634 (85.3%)
109 (14.7%)
743
(100.0%)
0
(0.0%)
8 top_code
[factor]
1. PROCESS
2. DATASET
3. VARIABLE
4. RELATIONSHIP
160 (21.5%)
176 (23.7%)
122 (16.4%)
285 (38.4%)
743
(100.0%)
0
(0.0%)
9 mid_code
[factor]
1. distribution (continuous
2. distribution (categorical
3. relationship (categorical
4. relationship (categorical
5. relationship (continuous
6. relationship (multivariat
77 (18.0%)
54 (12.6%)
28 ( 6.6%)
55 (12.9%)
146 (34.2%)
67 (15.7%)
427
(57.5%)
316
(42.5%)
10 low_code
[factor]
1. data orientation
2. data provenance
3. data size
4. distribution outlier (var
5. distribution range [min,
6. distribution shape [shape
7. distribution variance (sd
8. missing data
9. outlier (relationship)
10. plan of action
[ 8 others ]
16 ( 2.2%)
11 ( 1.5%)
9 ( 1.2%)
9 ( 1.2%)
33 ( 4.4%)
79 (10.6%)
1 ( 0.1%)
76 (10.2%)
20 ( 2.7%)
52 ( 7.0%)
437 (58.8%)
743
(100.0%)
0
(0.0%)
11 repns
[character]
1. scatterplot
2. profile
3. none
4. dataframe
5. Multi-view Chart
6. data_dictionary
7. pairplot
8. lineplot
9. describe
10. double-profiler
[ 14 others ]
128 (17.2%)
107 (14.4%)
105 (14.1%)
74 (10.0%)
59 ( 7.9%)
56 ( 7.5%)
50 ( 6.7%)
36 ( 4.8%)
23 ( 3.1%)
23 ( 3.1%)
82 (11.0%)
743
(100.0%)
0
(0.0%)
12 timestamp
[hms, difftime]
min : 868
med : 70710
max : 215160
units : secs
622 distinct values 738
(99.3%)
5
(0.7%)
13 utterance
[character]
1. [Talking about the profil
2. actually, let me see if p
3. Although we have like les
4. And are they within range
5. And confidence in governm
6. And just I want to see ho
7. And so it looks like it s
8. And then if I had more ti
9. Because it does seem like
10. Data frame. Got a bunch o
[ 652 others ]
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
2 ( 0.3%)
723 (97.3%)
743
(100.0%)
0
(0.0%)

2 UTTERANCES

How many utterances have we collected and coded?

#COUNTS
n_rows <- nrow(df_insights)                 #one row per utterance_detail-code
n_unique <- nlevels(df_insights$uid)        #number of constructed utterance ids
#NOTE(review): nlevels() counts the declared pid levels, including any level with zero rows
n_participants <- nlevels(df_insights$pid)

#count number of codes per unique utterance, most-coded utterances first
s <- arrange(
  dplyr::summarise(group_by(df_insights, uid), count = n()),
  desc(count)
)

max_codes <- max(s$count)

#display frequencies (how many utterances received 1, 2, ... codes)
(f <- freq(
  s$count,
  order    = "freq",
  rows     = 1:10,
  headings = FALSE
))
## There are only 2 rows to show; higher numbers will be ignored
## 
##               Freq   % Valid   % Valid Cum.   % Total   % Total Cum.
## ----------- ------ --------- -------------- --------- --------------
##           1    581     87.76          87.76     87.76          87.76
##           2     81     12.24         100.00     12.24         100.00
##        <NA>      0                               0.00         100.00
##       Total    662    100.00         100.00    100.00         100.00
#tally by value instead of by row position in f: f is sorted by frequency, so its first
#row is "single-coded" only for as long as single-coded utterances are the majority
coded_single <- sum(s$count == 1)
coded_double <- sum(s$count == 2)

There are 743 coded utterances, representing 662 unique statements made by 13 participants in the study. 581 utterances were single-coded, while 81 utterances received two detail codes. No more than 2 detail codes were applied to any single utterance.

2.1 NUMBER OF UTTERANCES

What factors affect how many utterances were produced by participants?

2.1.1 BY Factors

#DEFINE DATAFRAME
#working copy with only the identifier, design and code columns used below
df <- select(df_insights, pid, uid, TASK, DATASET, data_order, top_code, low_code)

#SUMMARY TABLE
#DATASET (rows) x TASK (columns) counts of coded rows, with marginal sums
title <- "Utterances by TASK and DATASET"
cols <- c("Static Task", "Interactive Task", "Total Utterances")
cont <- table(df$DATASET, df$TASK)
kable_classic(kbl(addmargins(cont), caption = title, col.names = cols))
Utterances by TASK and DATASET
Static Task Interactive Task Total Utterances
happiness 256 164 420
space 136 178 314
Sum 392 342 734
#MOSAIC PLOTS
# vcd::mosaic(main="Proportion of Utterances by TASK and DATASET",
#             data = df_raw, TASK ~ DATASET, rot_labels=c(0,90,0,0),
#             offset_varnames = c(left = 4.5), offset_labels = c(left = -0.5),just_labels = "right",
#             spacing = spacing_dimequal(unit(1:2, "lines")))
 
# mosaic(formula = ~DATASET + TASK,
#        data = df,
#        main = "Proportion of Utterances by TASK and DATASET",
#        sub = "u = 734 coded utterances",
#        labeling = labeling_values,
#        labeling_args = list(set_varnames = c(graph = "TASK",
#                             datset = "DATASET")))

#DF SUMMARIZED BY TASK + DATASET
#one row per TASK x DATASET cell; c = number of coded rows in that cell
#NOTE(review): rows with a missing TASK/DATASET are kept as their own NA group here
df_summary <- dplyr::summarise(
  group_by(df, TASK, DATASET),
  c = n()
)
## `summarise()` has grouped output by 'TASK'. You can override using the
## `.groups` argument.
#STACKED BAR BY TASK
df_summary %>%
  ggplot(aes(x = TASK, y = c, fill = DATASET)) +
  geom_col() +
  geom_text(aes(label = c), position = "stack", hjust = 0.5, vjust = 1.5, size = 3) +
  labs(title = "Utterances by TASK and DATASET")

#STACKED BAR BY DATASET
df_summary %>%
  ggplot(aes(x = DATASET, y = c, fill = TASK)) +
  geom_col() +
  geom_text(aes(label = c), position = "stack", hjust = 0.5, vjust = 1.5, size = 3) +
  scale_fill_brewer(palette = 1, type = "qual")

2.1.2 By Participant

#DEFINE DATAFRAME
#working copy with only the identifier, design and code columns used below
df <- select(df_insights, pid, uid, TASK, DATASET, data_order, top_code, low_code)

#SUMMARY TABLE
#participant (rows) x TASK (columns) counts of coded rows, with marginal sums
title <- "Utterances by Participant and TASK"
cols <- c("Static Task", "Interactive Task", "Total Utterances")
cont <- table(df$pid, df$TASK)
kable_classic(kbl(addmargins(cont), caption = title, col.names = cols))
Utterances by Participant and TASK
Static Task Interactive Task Total Utterances
bjs827ee1u 11 18 29
3r2sh20ei 63 40 103
4728sjuiz 30 13 43
7ACC0B75 18 10 28
92ghd48xe 28 28 56
iurmer28 0 0 0
s294hoei 60 28 88
j2719eertu2 32 46 78
lkin27js09b 26 17 43
li832lin23 17 34 51
7382kwtue 24 30 54
E1D39056 10 15 25
8v892iige 27 22 49
Sum 346 301 647
#SUMMARY TABLE
#participant (rows) x DATASET (columns) counts of coded rows, with marginal sums
title <- "Utterances by Participant and DATASET"
cols <- c("Happiness", "Space", "Total Utterances")
cont <- table(df$pid, df$DATASET)
kable_classic(kbl(addmargins(cont), caption = title, col.names = cols))
Utterances by Participant and DATASET
Happiness Space Total Utterances
bjs827ee1u 11 18 29
3r2sh20ei 63 40 103
4728sjuiz 30 13 43
7ACC0B75 18 10 28
92ghd48xe 28 28 56
iurmer28 0 0 0
s294hoei 60 28 88
j2719eertu2 46 32 78
lkin27js09b 17 26 43
li832lin23 34 17 51
7382kwtue 30 24 54
E1D39056 15 10 25
8v892iige 22 27 49
Sum 374 273 647
#VISUALIZE TASK+DATASET FACET BY PARTICIPANTS
#one panel per participant; bars count coded rows per TASK, filled by DATASET
gf_facet_grid(
  gf_bar(~TASK, data = df, fill = ~DATASET),
  ~pid
)

#UTTERANCES by PARTICPANT and TASK (horizontal)
#(a gf_facet_grid(. ~ TASK) step was previously tried here and left disabled)
gf_bar(pid ~ uid, data = df, fill = ~TASK) +
  labs(
    fill = "Analysis Task",
    x = "number of coded utterances",
    y = "participant",
    title = "Number of Utterances by Participant and Task",
    subtitle = "some participants were far more talkative than others"
  )

#UTTERANCES by PARTICPANT and DATASET (horizontal)
#bars per participant, filled by DATASET
gf_bar(  pid ~ uid , fill = ~ DATASET, data = df) + 
  # %>% gf_facet_grid(.~DATASET) +
  labs(
    title = "Number of Utterances by Participant and Dataset",
    #happiness is the dataset with the numeric outcome (space is the nominal one),
    #so the subtitle must not call it "Nominal"
    subtitle = "Numeric outcome variable (happiness) tended to yield more utterances",
    x = "number of coded utterances",
    y = "participant",
    fill = "Dataset"
  )

#UTTERANCES by PARTICIPANT and DATASET, one panel per TASK (horizontal)
gf_facet_grid(
  gf_bar(pid ~ uid, data = df, fill = ~DATASET),
  . ~ TASK
) +
  labs(
    fill = "DATASET",
    x = "number of coded utterances",
    y = "participant",
    title = "Number of Utterances by Participant, Dataset and Task",
    subtitle = ""
  )

2.2 KINDS OF UTTERANCES

2.2.1 BY Factors

#SUMMARIZED DF
#one row per TASK x DATASET x top_code cell; c = number of coded rows in the cell
#top_code levels are reversed so the stacking order in later plots is flipped
df_summary <- df %>%
  select(TASK, DATASET, top_code) %>%
  mutate(top_code = fct_rev(top_code)) %>%
  group_by(TASK, DATASET, top_code) %>%
  dplyr::summarise(c = n())
## `summarise()` has grouped output by 'TASK', 'DATASET'. You can override using
## the `.groups` argument.
#SUMMARY TABLE
#three-way counts: top_code x DATASET, one slice per TASK, with marginal sums
#(title and cols are only used by the disabled kbl() step that follows this table)
title <- "CODED Utterances by TYPE and DATASET"
cols <- c("Static Task", "Interactive Task", "Total Utterances")
cont <- table(df$top_code, df$DATASET, df$TASK)
addmargins(cont)
## , ,  = static
## 
##               
##                happiness space Sum
##   PROCESS             70    20  90
##   DATASET             53    44  97
##   VARIABLE            51    25  76
##   RELATIONSHIP        82    47 129
##   Sum                256   136 392
## 
## , ,  = ixn
## 
##               
##                happiness space Sum
##   PROCESS             29    38  67
##   DATASET             27    48  75
##   VARIABLE             8    37  45
##   RELATIONSHIP       100    55 155
##   Sum                164   178 342
## 
## , ,  = Sum
## 
##               
##                happiness space Sum
##   PROCESS             99    58 157
##   DATASET             80    92 172
##   VARIABLE            59    62 121
##   RELATIONSHIP       182   102 284
##   Sum                420   314 734
# %>% kbl(caption = title, col.names = cols) %>%  kable_classic()

#CODED UTTERANCES BAR BY TASK and DATASET
#stacked counts of each top-level code per TASK, one panel per DATASET
(p_coded_utterances <- 
  ggplot(df_summary, aes(x = TASK, y = c, fill = top_code)) + 
  geom_col() + 
  #facet on the DATASET column of the plot data rather than on an external
  #df_summary$DATASET vector, so the faceting follows whatever data the plot holds
  facet_grid(. ~ DATASET) +
  geom_text(aes(label = c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + 
  scale_fill_brewer(type = "qual", palette = 3) +  
  labs(title = "CODED Utterances by TASK and DATASET"))

2.2.2 By Participant

#UTTERANCES by PARTICPANT, TASK, and DATASET (horizontal)
#FACETED BY PARTICIPANT AND TOP CODE 
gf_bar(  TASK ~ uid , fill = ~ DATASET, data = df_insights) %>%  
  gf_facet_grid(top_code ~ pid) + 
  labs(
    #the three variables shown are participant (columns), task (y axis) and dataset (fill)
    title = "High Level Utterances by Participant, Task and Dataset",
    subtitle = "",
    x = "number of utterances",
    y = "Analysis Task",
    fill = "Dataset"
  )
## Warning: Ignoring unknown aesthetics: .

#REMOVE NAS
#drop every row of df that has a missing value in any of its columns
d <- df %>% na.omit()

#TOP CODE by PARTICIPANT & TASK
#facets are given as a formula on the plot data instead of the raw d$TASK vector
ggplot(d) +
  geom_bar(aes(y = pid, fill = top_code)) +
  facet_wrap(~TASK) +
  scale_fill_brewer(type = "qual", palette = 3) +
  theme_minimal() + 
  labs(title = "TOP-CODE by TASK")

#TOP CODE by PARTICIPANT & DATASET
#facets are given as a formula on the plot data instead of the raw d$DATASET vector
ggplot(d) +
  geom_bar(aes(y = pid, fill = top_code)) +
  facet_wrap(~DATASET) +
  scale_fill_brewer(type = "qual", palette = 3) +
  theme_minimal() + 
  labs(title = "TOP-CODE by DATASET")

#HACK BIDIRECTIONAL BAR CHART
#SUMMARIZED == reverse static to make bidirectional
# df_summ <- df %>% group_by(pid,TASK,DATASET,top_code,data_order) %>% 
#   dplyr::summarise(
#     n_utterances = n()
#   ) %>% mutate(
#     adj_utterances = ifelse(TASK=="static", (n_utterances*-1), (n_utterances*1))
#   ) %>% arrange(data_order) %>% na.omit()
#   
#  #BIDIRECTIONAL REVERSED
#  ggplot (df_summ, aes(x=adj_utterances, y=pid, fill = top_code)) + 
#   geom_bar(stat="identity") + 
#   facet_wrap(~TASK) +
#   scale_fill_brewer(type="qual", palette = 3) +
#   theme_minimal()
  
# df_hap_first <- df %>% filter(data_order == "happiness-first")
# df_space_first <- df %>% filter(data_order == "space-first")
# 
# top <- gf_bar( pid ~ ., fill = ~top_code, data = df_hap_first) %>%
#   gf_facet_grid(. ~ TASK) +
#   scale_fill_brewer(type="qual", palette = 3)+
#   theme_minimal()
# 
# bottom <- gf_bar( pid ~ ., fill = ~top_code, data = df_space_first) %>%
#   gf_facet_grid(. ~ TASK) +
#   scale_fill_brewer(type="qual", palette = 3)+
#   theme_minimal()
# 
# top
# bottom
#one row per TASK x DATASET x detail-code cell; c = number of coded rows in the cell
df_summary <- dplyr::summarise(
  group_by(df, TASK, DATASET, low_code),
  c = n()
)
## `summarise()` has grouped output by 'TASK', 'DATASET'. You can override using
## the `.groups` argument.
#SUMMARY TABLE
#three-way counts: detail code x TASK, one slice per DATASET, with marginal sums
print("DETAIL-CODES TYPE and DATASET")
## [1] "DETAIL-CODES TYPE and DATASET"
addmargins(table(df$low_code, df$TASK, df$DATASET))
## , ,  = happiness
## 
##                                                     
##                                                      static ixn Sum
##   data orientation                                        2   2   4
##   data provenance                                         3   5   8
##   data size                                               4   1   5
##   distribution outlier (variable)                         4   0   4
##   distribution range [min, max]                          25   3  28
##   distribution shape [shape, skew, kurtosis]             22   4  26
##   distribution variance (sd, var)                         0   1   1
##   missing data                                           23  14  37
##   outlier (relationship)                                  6  12  18
##   plan of action                                         20  10  30
##   relationship cluster(s)/subgroup/ unexpected            7  16  23
##   relationship existence / non-existence                  8   6  14
##   relationship faceted distribution characterization      0   0   0
##   relationship form (linearity/non-linearity)             7   8  15
##   relationship range constriction                         2   3   5
##   relationship strength and/or direction                 52  55 107
##   representation comment                                 50  19  69
##   variable metadata                                      21   5  26
##   Sum                                                   256 164 420
## 
## , ,  = space
## 
##                                                     
##                                                      static ixn Sum
##   data orientation                                        5   5  10
##   data provenance                                         1   2   3
##   data size                                               3   1   4
##   distribution outlier (variable)                         3   2   5
##   distribution range [min, max]                           1   4   5
##   distribution shape [shape, skew, kurtosis]             21  31  52
##   distribution variance (sd, var)                         0   0   0
##   missing data                                           23  14  37
##   outlier (relationship)                                  0   1   1
##   plan of action                                         11   9  20
##   relationship cluster(s)/subgroup/ unexpected            3   3   6
##   relationship existence / non-existence                  8   9  17
##   relationship faceted distribution characterization      0   3   3
##   relationship form (linearity/non-linearity)             0   0   0
##   relationship range constriction                         2   2   4
##   relationship strength and/or direction                 34  37  71
##   representation comment                                  9  29  38
##   variable metadata                                      12  26  38
##   Sum                                                   136 178 314
## 
## , ,  = Sum
## 
##                                                     
##                                                      static ixn Sum
##   data orientation                                        7   7  14
##   data provenance                                         4   7  11
##   data size                                               7   2   9
##   distribution outlier (variable)                         7   2   9
##   distribution range [min, max]                          26   7  33
##   distribution shape [shape, skew, kurtosis]             43  35  78
##   distribution variance (sd, var)                         0   1   1
##   missing data                                           46  28  74
##   outlier (relationship)                                  6  13  19
##   plan of action                                         31  19  50
##   relationship cluster(s)/subgroup/ unexpected           10  19  29
##   relationship existence / non-existence                 16  15  31
##   relationship faceted distribution characterization      0   3   3
##   relationship form (linearity/non-linearity)             7   8  15
##   relationship range constriction                         4   5   9
##   relationship strength and/or direction                 86  92 178
##   representation comment                                 59  48 107
##   variable metadata                                      33  31  64
##   Sum                                                   392 342 734
# #CODED UTTERANCES BAR BY TASK and DATASET
# # (p_coded_utterances <- 
#   ggplot(df_summary, aes(x = TASK, y=c, fill= low_code)) + 
#   geom_col() + 
#   facet_grid(.~df_summary$DATASET)+
#   # geom_text(aes(label=c), size = 3, hjust = 0.5, vjust = 1.5, position = "stack") + 
#   scale_fill_brewer(type="qual", palette = 3) +  
#   labs(title = "CODED Utterances by TASK and DATASET")
#   # )
# 
# 

2.3 PROCESS

#rows whose top-level code is PROCESS
df_process <- filter(df, top_code == "PROCESS")

#detail-code counts per TASK x DATASET cell within the PROCESS rows
summ_process <- dplyr::summarise(
  group_by(df_process, TASK, DATASET, low_code),
  c = n()
)
## `summarise()` has grouped output by 'TASK', 'DATASET'. You can override using
## the `.groups` argument.
#BY TASK
df_process %>% gf_bar(~TASK, fill = ~low_code)

#BY PARTICIPANT
df_process %>%
  gf_bar(~TASK, fill = ~low_code) %>%
  gf_facet_grid(. ~ pid)

## DATASET

#rows whose top-level code is DATASET
df_dataset <- df %>% filter(top_code == "DATASET")

#detail-code counts per TASK x DATASET cell within the DATASET rows; stored under its
#own name so it no longer overwrites summ_process from the PROCESS section
summ_dataset <- df_dataset %>% 
  group_by(TASK, DATASET, low_code) %>% 
  dplyr::summarise( 
    c = n()
  )
## `summarise()` has grouped output by 'TASK', 'DATASET'. You can override using
## the `.groups` argument.
#BY TASK
gf_bar(df_dataset, ~TASK, fill = ~low_code)

#BY PARTICIPANT
gf_bar(df_dataset, ~TASK, fill = ~low_code) %>% 
  gf_facet_grid(.~pid)

## VARIABLE

#rows whose top-level code is VARIABLE
df_variable <- df %>% filter(top_code == "VARIABLE")

#detail-code counts per TASK x DATASET cell within the VARIABLE rows; stored under its
#own name so it no longer overwrites summ_process from the PROCESS section
summ_variable <- df_variable %>% 
  group_by(TASK, DATASET, low_code) %>% 
  dplyr::summarise( 
    c = n()
  )
## `summarise()` has grouped output by 'TASK', 'DATASET'. You can override using
## the `.groups` argument.
#BY TASK
gf_bar(df_variable, ~TASK, fill = ~low_code)

#BY PARTICIPANT
gf_bar(df_variable, ~TASK, fill = ~low_code) %>% 
  gf_facet_grid(.~pid)

## RELATIONSHIP

#rows whose top-level code is RELATIONSHIP
df_relationship <- df %>% filter(top_code == "RELATIONSHIP")

#detail-code counts per TASK x DATASET cell within the RELATIONSHIP rows; stored under
#its own name so it no longer overwrites summ_process from the PROCESS section
summ_relationship <- df_relationship %>% 
  group_by(TASK, DATASET, low_code) %>% 
  dplyr::summarise( 
    c = n()
  )
## `summarise()` has grouped output by 'TASK', 'DATASET'. You can override using
## the `.groups` argument.
#BY TASK
gf_bar(df_relationship, ~TASK, fill = ~low_code)

#BY PARTICIPANT
gf_bar(df_relationship, ~TASK, fill = ~low_code) %>% 
  gf_facet_grid(.~pid)

# MODELLING

#DEFINE DATAFRAME
#identifiers + design factors only; na.omit() then drops every row with a missing
#value in ANY of the four columns
df_raw <- na.omit(
  df_insights %>%
    select(pid, uid, TASK, DATASET) %>%
    mutate(TASK = factor(TASK, levels = c("static", "ixn"))) #reorder factor levels
)
print("WARNING: THE FOLLOWING HAVE OMMITED MISSING DATA RATHER THAN FINDING THE SOURCE")
## [1] "WARNING: THE FOLLOWING HAVE OMMITED MISSING DATA RATHER THAN FINDING THE SOURCE"
#DF SUMMARIZED BY SUBJECT
#one row per participant x TASK x DATASET; n_utterances counts rows of df_raw, i.e.
#one per applied code, so a double-coded utterance contributes twice
df_subject <- dplyr::summarise(
  group_by(df_raw, pid, TASK, DATASET),
  n_utterances = n()
)

#VISUALIZE PARTICIPANTS
#row counts per TASK filled by DATASET, one panel per participant
gf_facet_grid(
  gf_bar(~TASK, data = df_raw, fill = ~DATASET),
  . ~ pid
)

#VISUALIZE TOTALS
#same counts pooled over participants, split both ways
gf_bar(~TASK, data = df_raw, fill = ~DATASET)

gf_bar(~DATASET, data = df_raw, fill = ~TASK)

#MOSAIC PLOT
#TASK conditioned on DATASET, drawn from the NA-free df_raw
vcd::mosaic(TASK ~ DATASET,
            data = df_raw,
            main = "Proportion of Utterances by TASK and DATASET",
            rot_labels = c(0, 90, 0, 0),
            offset_varnames = c(left = 4.5), offset_labels = c(left = -0.5), just_labels = "right",
            spacing = spacing_dimequal(unit(1:2, "lines")))

#same table with cell counts printed in the tiles
vcd::mosaic(formula = ~DATASET + TASK,
            data = df_raw,
            main = "Proportion of Utterances by TASK and DATASET",
            #report the number of rows actually plotted: df_raw has had incomplete
            #rows removed, so a hard-coded total drifts out of sync with the data
            sub = paste("u =", nrow(df_raw), "utterance codes"),
            labeling = labeling_values,
            #set_varnames is keyed by the variable names in the data
            labeling_args = list(set_varnames = c(TASK = "TASK",
                                                  DATASET = "DATASET")))

2.4 UTTERANCES

How much variance in the number of utterances is explained by DATASET, TASK and PARTICIPANT?

2.4.1 OLS Fixed Effects Models

#NUMBER UTTERANCES predicted by DATASET + TASK --> OLS LINEAR REGRESSION
#additive model on the per-participant cell counts in df_subject
#NOTE(review): df_subject is grouped by pid x TASK x DATASET, so a participant can
#contribute more than one row; OLS treats those rows as independent - confirm intended
print("OLS-LM, UTTERANCES ~ DATASET + TASK")
## [1] "OLS-LM, UTTERANCES ~ DATASET + TASK"
m1 <- lm(formula = n_utterances ~ DATASET + TASK, data = df_subject)
print("Model")
## [1] "Model"
summ(m1)
Observations 24
Dependent variable n_utterances
Type OLS linear regression
F(2,21) 1.31
0.11
Adj. R² 0.03
Est. S.E. t val. p
(Intercept) 33.04 4.92 6.71 0.00
DATASETspace -8.42 5.68 -1.48 0.15
TASKixn -3.75 5.68 -0.66 0.52
Standard errors: OLS
paste("Partition Variance")
## [1] "Partition Variance"
anova(m1)
## Analysis of Variance Table
## 
## Response: n_utterances
##           Df Sum Sq Mean Sq F value Pr(>F)
## DATASET    1  425.0  425.04  2.1923 0.1536
## TASK       1   84.4   84.38  0.4352 0.5166
## Residuals 21 4071.5  193.88
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(m1)
##                  2.5 %    97.5 %
## (Intercept)   22.80385 43.279487
## DATASETspace -20.23828  3.404950
## TASKixn      -15.57162  8.071617
report(m1) #sanity check
## We fitted a linear model (estimated using OLS) to predict n_utterances with
## DATASET and TASK (formula: n_utterances ~ DATASET + TASK). The model explains a
## statistically not significant and weak proportion of variance (R2 = 0.11, F(2,
## 21) = 1.31, p = 0.290, adj. R2 = 0.03). The model's intercept, corresponding to
## DATASET = happiness and TASK = static, is at 33.04 (95% CI [22.80, 43.28],
## t(21) = 6.71, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically non-significant and negative
## (beta = -8.42, 95% CI [-20.24, 3.40], t(21) = -1.48, p = 0.154; Std. beta =
## -0.60, 95% CI [-1.43, 0.24])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -3.75, 95% CI [-15.57, 8.07], t(21) = -0.66, p = 0.517; Std. beta = -0.27,
## 95% CI [-1.10, 0.57])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
plot_model(m1,  show.intercept = TRUE)

check_model(m1)

#NUMBER UTTERANCES predicted by DATASET X TASK --> LINEAR REGRESSION
#same OLS model as the additive fit, plus the DATASET:TASK interaction
print("OLS-LM, UTTERANCES ~ DATASET * TASK")
## [1] "OLS-LM, UTTERANCES ~ DATASET * TASK"
m2 <- lm(formula = n_utterances ~ DATASET * TASK, data = df_subject)
print("Model")
## [1] "Model"
summ(m2)
Observations 24
Dependent variable n_utterances
Type OLS linear regression
F(3,20) 1.01
0.13
Adj. R² 0.00
Est. S.E. t val. p
(Intercept) 35.00 5.76 6.08 0.00
DATASETspace -12.33 8.14 -1.51 0.15
TASKixn -7.67 8.14 -0.94 0.36
DATASETspace:TASKixn 7.83 11.52 0.68 0.50
Standard errors: OLS
paste("Partition Variance")
## [1] "Partition Variance"
anova(m2)
## Analysis of Variance Table
## 
## Response: n_utterances
##              Df Sum Sq Mean Sq F value Pr(>F)
## DATASET       1  425.0  425.04  2.1362 0.1594
## TASK          1   84.4   84.38  0.4240 0.5223
## DATASET:TASK  1   92.0   92.04  0.4626 0.5042
## Residuals    20 3979.5  198.98
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(m2)
##                          2.5 %    97.5 %
## (Intercept)           22.98759 47.012415
## DATASETspace         -29.32145  4.654787
## TASKixn              -24.65479  9.321453
## DATASETspace:TASKixn -16.19150 31.858163
report(m2) #sanity check
## We fitted a linear model (estimated using OLS) to predict n_utterances with
## DATASET and TASK (formula: n_utterances ~ DATASET * TASK). The model explains a
## statistically not significant and moderate proportion of variance (R2 = 0.13,
## F(3, 20) = 1.01, p = 0.410, adj. R2 = 9.90e-04). The model's intercept,
## corresponding to DATASET = happiness and TASK = static, is at 35.00 (95% CI
## [22.99, 47.01], t(20) = 6.08, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically non-significant and negative
## (beta = -12.33, 95% CI [-29.32, 4.65], t(20) = -1.51, p = 0.146; Std. beta =
## -0.87, 95% CI [-2.08, 0.33])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -7.67, 95% CI [-24.65, 9.32], t(20) = -0.94, p = 0.358; Std. beta = -0.54,
## 95% CI [-1.75, 0.66])
##   - The effect of DATASET [space] × TASK [ixn] is statistically non-significant
## and positive (beta = 7.83, 95% CI [-16.19, 31.86], t(20) = 0.68, p = 0.504;
## Std. beta = 0.56, 95% CI [-1.15, 2.26])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
plot_model(m2,  show.intercept = TRUE)

check_model(m2)

2.4.2 POISSON Fixed Effects Models

#NUMBER UTTERANCES predicted by DATASET + TASK --> POISSON DISTRIBUTION
#additive count model (log link) on the per-participant cell counts in df_subject
#NOTE(review): the rendered deviance table shows a residual deviance of 137.98 on 21 df,
#far above its df, which suggests overdispersion - confirm with a dispersion check
#before relying on the Poisson standard errors
print("GLM-POISSON, UTTERANCES ~ DATASET + TASK")
## [1] "GLM-POISSON, UTTERANCES ~ DATASET + TASK"
p.1 <- glm(
  formula = n_utterances ~ DATASET + TASK,
  family  = poisson(link = "log"),
  data    = df_subject
)
print("Model")
## [1] "Model"
summ(p.1)
Observations 24
Dependent variable n_utterances
Type Generalized linear model
Family poisson
Link log
𝛘²(2) 18.96
Pseudo-R² (Cragg-Uhler) 0.55
Pseudo-R² (McFadden) 0.07
AIC 264.37
BIC 267.90
Est. S.E. z val. p
(Intercept) 3.51 0.06 55.32 0.00
DATASETspace -0.31 0.08 -3.95 0.00
TASKixn -0.14 0.08 -1.77 0.08
Standard errors: MLE
paste("Partition Variance")
## [1] "Partition Variance"
anova(p.1)
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: n_utterances
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev
## NULL                       23     156.94
## DATASET  1  15.8313        22     141.11
## TASK     1   3.1324        21     137.98
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(p.1)
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept)   3.3802256  3.62876668
## DATASETspace -0.4715199 -0.15931757
## TASKixn      -0.2941654  0.01495392
report(p.1) #sanity check
## We fitted a poisson model (estimated using ML) to predict n_utterances with
## DATASET and TASK (formula: n_utterances ~ DATASET + TASK). The model's
## explanatory power is substantial (Nagelkerke's R2 = 0.55). The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 3.51
## (95% CI [3.38, 3.63], p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically significant and negative (beta
## = -0.31, 95% CI [-0.47, -0.16], p < .001; Std. beta = -0.31, 95% CI [-0.47,
## -0.16])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -0.14, 95% CI [-0.29, 0.01], p = 0.077; Std. beta = -0.14, 95% CI [-0.29,
## 0.01])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
plot_model(p.1,show.intercept = TRUE)

check_model(p.1)

#NUMBER UTTERANCES predicted by DATASET * TASK --> POISSON DISTRIBUTION
# Poisson GLM (log link) of n_utterances on DATASET, TASK and their interaction,
# fitted to df_subject.
# NOTE(review): this model has no participant term, while the mixed-model
# summaries in this file report 24 observations in 12 pid groups; rows are
# therefore treated as independent here. Confirm that is intended.
# NOTE(review): the deviance table for this model shows a residual deviance of
# 135.44 on 20 df, which suggests overdispersion; check before relying on the
# Poisson standard errors.
print("GLM-POISSON, UTTERANCES ~ DATASET X TASK")
## [1] "GLM-POISSON, UTTERANCES ~ DATASET X TASK"
p.2 <- glm(n_utterances ~ DATASET * TASK, data = df_subject, family = "poisson")
paste("Model")
## [1] "Model"
summ(p.2)
Observations 24
Dependent variable n_utterances
Type Generalized linear model
Family poisson
Link log
𝛘²(3) 21.51
Pseudo-R² (Cragg-Uhler) 0.59
Pseudo-R² (McFadden) 0.08
AIC 263.83
BIC 268.54
Est. S.E. z val. p
(Intercept) 3.56 0.07 51.52 0.00
DATASETspace -0.43 0.11 -3.95 0.00
TASKixn -0.25 0.10 -2.37 0.02
DATASETspace:TASKixn 0.25 0.16 1.59 0.11
Standard errors: MLE
# Sequential analysis-of-deviance table for p.2 (terms added first to last).
# No `test` argument is supplied, and the table printed below has no p-value
# column, only the deviance per term.
paste("Partition Variance")
## [1] "Partition Variance"
anova(p.2)
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: n_utterances
## 
## Terms added sequentially (first to last)
## 
## 
##              Df Deviance Resid. Df Resid. Dev
## NULL                            23     156.94
## DATASET       1  15.8313        22     141.11
## TASK          1   3.1324        21     137.98
## DATASET:TASK  1   2.5434        20     135.44
# Interval estimates for the p.2 coefficients (the output notes that profiling
# is performed). Values are on the log link scale, not in utterance counts.
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(p.2)
## Waiting for profiling to be done...
##                            2.5 %     97.5 %
## (Intercept)           3.41697480  3.6876129
## DATASETspace         -0.65205770 -0.2201745
## TASKixn              -0.45252318 -0.0436738
## DATASETspace:TASKixn -0.05827406  0.5681766
report(p.2) #sanity check
## We fitted a poisson model (estimated using ML) to predict n_utterances with
## DATASET and TASK (formula: n_utterances ~ DATASET * TASK). The model's
## explanatory power is substantial (Nagelkerke's R2 = 0.59). The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 3.56
## (95% CI [3.42, 3.69], p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically significant and negative (beta
## = -0.43, 95% CI [-0.65, -0.22], p < .001; Std. beta = -0.43, 95% CI [-0.65,
## -0.22])
##   - The effect of TASK [ixn] is statistically significant and negative (beta =
## -0.25, 95% CI [-0.45, -0.04], p = 0.018; Std. beta = -0.25, 95% CI [-0.45,
## -0.04])
##   - The effect of DATASET [space] × TASK [ixn] is statistically non-significant
## and positive (beta = 0.25, 95% CI [-0.06, 0.57], p = 0.111; Std. beta = 0.25,
## 95% CI [-0.06, 0.57])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
plot_model(p.2,show.intercept = TRUE)

check_model(p.2)

2.4.3 OLS Mixed Effects Models

#NUMBER UTTERANCES predicted by DATASET + TASK | participant --> MIXED LINEAR REGRESSION
# Linear mixed model: DATASET and TASK as additive fixed effects, plus a random
# intercept for each participant via (1|pid).
# No REML argument is passed; the report() text for this model describes the
# fit as estimated using REML.
print("LMER, UTTERANCES ~ DATASET + TASK")
## [1] "LMER, UTTERANCES ~ DATASET + TASK"
mm1 <- lmer(n_utterances ~ DATASET + TASK+ (1|pid), data = df_subject)
paste("Model")
## [1] "Model"
summ(mm1)
Observations 24
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 182.77
BIC 188.66
Pseudo-R² (fixed effects) 0.11
Pseudo-R² (total) 0.62
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 33.04 4.36 7.57 19.70 0.00
DATASETspace -8.42 3.64 -2.31 10.00 0.04
TASKixn -3.75 3.64 -1.03 10.00 0.33
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 10.45
Residual 8.91
Grouping Variables
Group # groups ICC
pid 12 0.58
paste("Partition Variance")
## [1] "Partition Variance"
anova(mm1)
## Type III Analysis of Variance Table with Satterthwaite's method
##         Sum Sq Mean Sq NumDF DenDF F value  Pr(>F)  
## DATASET 425.04  425.04     1    10  5.3526 0.04324 *
## TASK     84.37   84.37     1    10  1.0625 0.32692  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Profile confidence intervals for mm1. The .sig01 and .sigma rows are not
# fixed effects (presumably the pid random-intercept SD and the residual SD;
# confirm against the lme4 documentation).
# These intervals differ from the ones quoted in the report() text for this
# model, which states it uses a Wald t-distribution approximation.
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(mm1)
## Computing profile confidence intervals ...
##                   2.5 %    97.5 %
## .sig01         4.419633 17.646217
## .sigma         5.714441 12.903457
## (Intercept)   24.600922 41.482411
## DATASETspace -15.483056 -1.350277
## TASKixn      -10.816390  3.316389
# NOTE(review): report() quotes Wald t(19) tests (DATASET p = 0.032), whereas
# summ(mm1) uses Satterthwaite d.f. (10.00, DATASET p = 0.04). Pick one method
# and cite it consistently when writing up.
report(mm1) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## + TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.62) and the
## part related to the fixed effects alone (marginal R2) is of 0.11. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 33.04
## (95% CI [23.91, 42.17], t(19) = 7.57, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically significant and negative (beta
## = -8.42, 95% CI [-16.03, -0.80], t(19) = -2.31, p = 0.032; Std. beta = -0.60,
## 95% CI [-1.14, -0.06])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -3.75, 95% CI [-11.36, 3.86], t(19) = -1.03, p = 0.316; Std. beta = -0.27,
## 95% CI [-0.81, 0.27])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
plot_model(mm1,  show.intercept = TRUE)

check_model(mm1)

#NUMBER UTTERANCES predicted by DATASET * TASK  | participant --> MIXED LINEAR REGRESSION
# Same random-intercept structure as mm1, (1|pid), with the DATASET:TASK
# interaction added to the fixed effects.
# NOTE(review): report() describes both mm1 and mm2 as REML fits. AIC/BIC from
# REML fits with different fixed effects should not be compared directly;
# refit with REML = FALSE if the two models are to be compared.
print("LMER, UTTERANCES ~ DATASET X TASK")
## [1] "LMER, UTTERANCES ~ DATASET X TASK"
mm2 <- lmer(n_utterances ~ DATASET * TASK + (1|pid), data = df_subject)
paste("Model")
## [1] "Model"
summ(mm2)
Observations 24
Dependent variable n_utterances
Type Mixed effects linear regression
AIC 177.31
BIC 184.38
Pseudo-R² (fixed effects) 0.12
Pseudo-R² (total) 0.65
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 35.00 5.76 6.08 14.69 0.00
DATASETspace -12.33 8.14 -1.51 14.69 0.15
TASKixn -7.67 8.14 -0.94 14.69 0.36
DATASETspace:TASKixn 7.83 14.57 0.54 10.00 0.60
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 10.93
Residual 8.91
Grouping Variables
Group # groups ICC
pid 12 0.60
paste("Partition Variance")
## [1] "Partition Variance"
anova(mm2)
## Type III Analysis of Variance Table with Satterthwaite's method
##              Sum Sq Mean Sq NumDF DenDF F value  Pr(>F)  
## DATASET      425.04  425.04     1    10  5.3526 0.04324 *
## TASK          84.37   84.37     1    10  1.0625 0.32692  
## DATASET:TASK  22.94   22.94     1    10  0.2889 0.60265  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(mm2)
## Computing profile confidence intervals ...
##                           2.5 %    97.5 %
## .sig01                 4.150279 17.371325
## .sigma                 5.714460 12.902464
## (Intercept)           24.084665 45.915336
## DATASETspace         -27.769948  3.103282
## TASKixn              -23.103281  7.769949
## DATASETspace:TASKixn -20.472642 36.139309
report(mm2) #sanity check
## We fitted a linear mixed model (estimated using REML and nloptwrap optimizer)
## to predict n_utterances with DATASET and TASK (formula: n_utterances ~ DATASET
## * TASK). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.65) and the
## part related to the fixed effects alone (marginal R2) is of 0.12. The model's
## intercept, corresponding to DATASET = happiness and TASK = static, is at 35.00
## (95% CI [22.90, 47.10], t(18) = 6.08, p < .001). Within this model:
## 
##   - The effect of DATASET [space] is statistically non-significant and negative
## (beta = -12.33, 95% CI [-29.44, 4.78], t(18) = -1.51, p = 0.147; Std. beta =
## -0.87, 95% CI [-2.09, 0.34])
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -7.67, 95% CI [-24.78, 9.44], t(18) = -0.94, p = 0.359; Std. beta = -0.54,
## 95% CI [-1.76, 0.67])
##   - The effect of DATASET [space] × TASK [ixn] is statistically non-significant
## and positive (beta = 7.83, 95% CI [-22.78, 38.45], t(18) = 0.54, p = 0.597;
## Std. beta = 0.56, 95% CI [-1.61, 2.72])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald t-distribution approximation.
plot_model(mm2,  show.intercept = TRUE)

check_model(mm2)

2.4.4 POISSON Mixed Effects Models

#NUMBER UTTERANCES predicted by TASK + DATASET  | participant --> POISSON MIXED LINEAR REGRESSION
# Poisson generalized linear mixed model (log link): TASK and DATASET as
# additive fixed effects, plus a random intercept for each participant via
# (1|pid).
# The printed label lists DATASET first, but the formula puts TASK first; the
# coefficient and anova tables below follow the formula order.
print("POISSON-MER, UTTERANCES ~ DATASET + TASK")
## [1] "POISSON-MER, UTTERANCES ~ DATASET + TASK"
pmm1 <- glmer(n_utterances ~ TASK + DATASET + (1|pid), data = df_subject, family = "poisson")
paste("Model")
## [1] "Model"
summ(pmm1)
Observations 24
Dependent variable n_utterances
Type Mixed effects generalized linear model
Family poisson
Link log
AIC 189.26
BIC 193.97
Pseudo-R² (fixed effects) 0.12
Pseudo-R² (total) 0.83
Fixed Effects
Est. S.E. z val. p
(Intercept) 3.41 0.13 25.59 0.00
TASKixn -0.12 0.08 -1.52 0.13
DATASETspace -0.31 0.08 -3.87 0.00
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 0.40
Grouping Variables
Group # groups ICC
pid 12 0.14
# Variance table for the fixed-effect terms of pmm1. The table printed below
# has npar, Sum Sq, Mean Sq and F value columns but no p-values, so it cannot
# be read as a significance test on its own.
paste("Partition Variance")
## [1] "Partition Variance"
anova(pmm1)
## Analysis of Variance Table
##         npar  Sum Sq Mean Sq F value
## TASK       1  2.9523  2.9523  2.9523
## DATASET    1 14.8233 14.8233 14.8233
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(pmm1)
## Computing profile confidence intervals ...
##                   2.5 %      97.5 %
## .sig01        0.2637590  0.66629252
## (Intercept)   3.1288524  3.68682117
## TASKixn      -0.2776159  0.03519974
## DATASETspace -0.4646941 -0.15157217
report(pmm1) #sanity check
## We fitted a poisson mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict n_utterances with TASK and DATASET (formula: n_utterances ~ TASK +
## DATASET). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.83) and the
## part related to the fixed effects alone (marginal R2) is of 0.12. The model's
## intercept, corresponding to TASK = static and DATASET = happiness, is at 3.41
## (95% CI [3.15, 3.67], p < .001). Within this model:
## 
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -0.12, 95% CI [-0.28, 0.03], p = 0.128; Std. beta = -0.12, 95% CI [-0.28,
## 0.03])
##   - The effect of DATASET [space] is statistically significant and negative (beta
## = -0.31, 95% CI [-0.46, -0.15], p < .001; Std. beta = -0.31, 95% CI [-0.46,
## -0.15])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
plot_model(pmm1,  show.intercept = TRUE)

check_model(pmm1)

#NUMBER UTTERANCES predicted by TASK X DATASET  | participant --> POISSON MIXED LINEAR REGRESSION
# Same Poisson mixed model as pmm1, (1|pid) random intercept, with the
# TASK:DATASET interaction added to the fixed effects.
# The summaries report AIC 191.18 for this model versus 189.26 for pmm1, i.e.
# the interaction term does not lower AIC.
print("POISSON-MER, UTTERANCES ~ DATASET X TASK")
## [1] "POISSON-MER, UTTERANCES ~ DATASET X TASK"
pmm2 <- glmer(n_utterances ~ TASK * DATASET + (1|pid), data = df_subject, family = "poisson")
paste("Model")
## [1] "Model"
summ(pmm2)
Observations 24
Dependent variable n_utterances
Type Mixed effects generalized linear model
Family poisson
Link log
AIC 191.18
BIC 197.07
Pseudo-R² (fixed effects) 0.13
Pseudo-R² (total) 0.83
Fixed Effects
Est. S.E. z val. p
(Intercept) 3.45 0.18 19.19 0.00
TASKixn -0.19 0.26 -0.74 0.46
DATASETspace -0.38 0.26 -1.46 0.14
TASKixn:DATASETspace 0.14 0.49 0.28 0.78
Random Effects
Group Parameter Std. Dev.
pid (Intercept) 0.40
Grouping Variables
Group # groups ICC
pid 12 0.14
# Variance table for the fixed-effect terms of pmm2. As printed below it has
# F values but no p-value column.
paste("Partition Variance")
## [1] "Partition Variance"
anova(pmm2)
## Analysis of Variance Table
##              npar  Sum Sq Mean Sq F value
## TASK            1  3.0017  3.0017  3.0017
## DATASET         1 14.8190 14.8190 14.8190
## TASK:DATASET    1  0.0783  0.0783  0.0783
paste("Confidence Interval on Parameter Estimates")
## [1] "Confidence Interval on Parameter Estimates"
confint(pmm2)
## Computing profile confidence intervals ...
##                           2.5 %    97.5 %
## .sig01                0.2618041 0.6637123
## (Intercept)           3.0601090 3.8198648
## TASKixn              -0.7259280 0.3541708
## DATASETspace         -0.9171433 0.1706822
## TASKixn:DATASETspace -0.9208727 1.1837182
report(pmm2) #sanity check
## We fitted a poisson mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict n_utterances with TASK and DATASET (formula: n_utterances ~ TASK *
## DATASET). The model included pid as random effect (formula: ~1 | pid). The
## model's total explanatory power is substantial (conditional R2 = 0.83) and the
## part related to the fixed effects alone (marginal R2) is of 0.13. The model's
## intercept, corresponding to TASK = static and DATASET = happiness, is at 3.45
## (95% CI [3.10, 3.80], p < .001). Within this model:
## 
##   - The effect of TASK [ixn] is statistically non-significant and negative (beta
## = -0.19, 95% CI [-0.69, 0.31], p = 0.460; Std. beta = -0.19, 95% CI [-0.69,
## 0.31])
##   - The effect of DATASET [space] is statistically non-significant and negative
## (beta = -0.38, 95% CI [-0.88, 0.13], p = 0.145; Std. beta = -0.38, 95% CI
## [-0.88, 0.13])
##   - The effect of TASK [ixn] × DATASET [space] is statistically non-significant
## and positive (beta = 0.14, 95% CI [-0.83, 1.11], p = 0.780; Std. beta = 0.14,
## 95% CI [-0.83, 1.11])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.
plot_model(pmm2,  show.intercept = TRUE)

check_model(pmm2)

3 TODO REPRESENTATIONS

#COUNTS 
# n_rows <- df_insights %>% nrow()
# n_unique <- nlevels(df_insights$uid)
# n_participants <- nlevels(df_insights$pid)
# 
# #count number of codes per unique utterance
# s <- df_insights %>% group_by(uid) %>% 
#     dplyr::summarise(
#       count = n()
# ) %>% arrange(desc(count), .by_group = TRUE)
# 
# max_codes <- max(s$count)
# 
# #display frequencies
# (f <- freq(s$count,
#      order    = "freq",
#      rows     = 1:10,
#      headings = FALSE))
# 
# coded_single <- f[1,1]
# coded_double <- f[2,1]